K 均值聚类

In [1]:
import numpy as np
from numpy.linalg import cholesky

import plotly
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=False)
import random
from tqdm import tqdm_notebook
import plotly.offline.offline as of

from sklearn.metrics.cluster import normalized_mutual_info_score as NMI ##评价指标

1 数据生成

In [37]:
Sigma=np.array([[1,0],[0,1]])
R=cholesky(Sigma)
mu1=np.array([1,-1])
mu2=np.array([5.5,-4.5])
mu3=np.array([1,4])
mu4=np.array([6,4.5])
mu5=np.array([9,0])
mu=np.vstack((mu1,mu2,mu3,mu4,mu5))
mu=np.sort(mu)

x1=np.concatenate([np.dot(np.random.randn(500, 2), R) + mu1,np.zeros((500,1))],axis=1)

x2=np.concatenate([np.dot(np.random.randn(500, 2), R) + mu2,np.ones((500,1))*1],axis=1)
x3=np.concatenate([np.dot(np.random.randn(500, 2), R) + mu3,np.ones((500,1))*2],axis=1)
x4=np.concatenate([np.dot(np.random.randn(500, 2), R) + mu4,np.ones((500,1))*3],axis=1)
x5=np.concatenate([np.dot(np.random.randn(500, 2), R) + mu5,np.ones((500,1))*4],axis=1)

data=np.vstack((x1,x2,x3,x4,x5))
np.savetxt('data',data)

fig = go.Figure()

trace = go.Scatter(
x = x1[:,0],y = x1[:,1],
    mode='markers',
name ="$id = {}$".format(1))
fig.add_trace(trace)

trace = go.Scatter(
x = x2[:,0],y = x2[:,1],
mode='markers',
name ="$id = {}$".format(2))
fig.add_trace(trace)

trace = go.Scatter(
x = x3[:,0],y = x3[:,1],
    mode='markers',
name ="$id = {}$".format(3))
fig.add_trace(trace)

trace = go.Scatter(
x = x4[:,0],y = x4[:,1],
    mode='markers',
name ="$id = {}$".format(4))
fig.add_trace(trace)

trace = go.Scatter(
x = x5[:,0],y = x5[:,1],
    mode='markers',
name ="$id = {}$".format(5))
fig.add_trace(trace)
fig.update_layout(
    xaxis_title='$x_1$',
    yaxis_title='$x_2$',
)
fig.show()

2 模型实现

In [38]:
class K_Means(object):
    # k是分组数;tolerance‘中心点误差’;max_iter是迭代次数
    def __init__(self, k=2, tolerance=0.0001, max_iter=500):
        self.k_ = k
        self.tolerance_ = tolerance
        self.max_iter_ = max_iter

    def fit(self, data, c = None):
        if not c:
            ## 随机初始化
            self.centers_ = {}
            for i in range(self.k_):
                self.centers_[i] = data[i]
        else:
            self.centers_ = c

        for j in range(self.max_iter_):
            self.clf_ = {}
            for i in range(self.k_):
                self.clf_[i] = []
            for feature in data:
                distances = []
                for center in self.centers_:
                    # 欧拉距离
                    # np.sqrt(np.sum((features-self.centers_[center])**2))
                    distances.append(np.linalg.norm(feature - self.centers_[center]))
                classification = distances.index(min(distances))
                self.clf_[classification].append(feature)

            # print("分组情况:",self.clf_)
            prev_centers = dict(self.centers_)
            for c in self.clf_:
                self.centers_[c] = np.average(self.clf_[c], axis=0)

            # '中心点'是否在误差范围
            optimized = True
            for center in self.centers_:
                org_centers = prev_centers[center]
                cur_centers = self.centers_[center]
                if np.sum((cur_centers - org_centers) / org_centers * 100.0) > self.tolerance_:
                    optimized = False
            if optimized:
                break
            
        return j

    def predict(self, p_data):
        distances = [np.linalg.norm(p_data - self.centers_[center]) for center in self.centers_]
        index = distances.index(min(distances))
        return index

3 初始化比较

In [81]:
epoch = 10
NMI_random = []
NMI_center = []
iters_random = []
iters_center = []
In [82]:
for _ in range(epoch):
    model = K_Means(k = 5 ,max_iter=1000) ## 已知5
    np.random.shuffle(data)
    iters = model.fit(data)
    result = []
    for d in data:
        result.append(model.predict(d))
    result = np.array(result)
    iters_random.append(iters)
    NMI_random.append(NMI(result,data[:,-1]))
In [83]:
xs = [data[np.where(result==i)] for i in range(5)]
fig = go.Figure()

for i in range(5):
    trace = go.Scatter(
    x = xs[i][:,0],y = xs[i][:,1],
        mode='markers',
    name ="$id = {}$".format(i+1))
    fig.add_trace(trace)

fig.update_layout(
xaxis_title='$x_1$',
yaxis_title='$x_2$',
)
fig.show()
In [84]:
iters_random
Out[84]:
[19, 16, 3, 19, 4, 4, 10, 3, 12, 30]
In [85]:
NMI_random
Out[85]:
[0.9845563818231768,
 0.809430290060044,
 0.9860034183151838,
 0.8098895908478834,
 0.9845563818231767,
 0.9860034183151837,
 0.8099135921548994,
 0.9845563818231767,
 0.9860034183151838,
 0.7868974811674548]
In [86]:
data[:,:2].shape
Out[86]:
(2500, 2)
In [87]:
for _ in range(epoch):
    model = K_Means(k = 5 ,max_iter=1000) ## 已知5

    np.random.shuffle(data)
    iters = model.fit(data[:,:2],{0:mu1,1:mu2,2:mu3,3:mu4,4:mu5})
    result = []
    for d in data[:,:2]:
        result.append(model.predict(d))
    result = np.array(result)
    iters_center.append(iters)
    NMI_center.append(NMI(result,data[:,-1]))
In [88]:
xs = [data[np.where(result==i)] for i in range(5)]
fig = go.Figure()

for i in range(5):
    trace = go.Scatter(
    x = xs[i][:,0],y = xs[i][:,1],
        mode='markers',
    name ="$id = {}$".format(i+1))
    fig.add_trace(trace)

fig.update_layout(
xaxis_title='$x_1$',
yaxis_title='$x_2$',
)
fig.show()
In [89]:
iters_center
Out[89]:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
In [90]:
NMI_center
Out[90]:
[0.9739963735920177,
 0.9739963735920177,
 0.9739963735920177,
 0.9739963735920177,
 0.9739963735920177,
 0.9739963735920177,
 0.9739963735920177,
 0.9739963735920177,
 0.9739963735920177,
 0.9739963735920177]
In [98]:
fig = go.Figure()

trace = go.Scatter(
x = np.arange(0,epoch,1),y=iters_center,
name ="$center$")
fig.add_trace(trace)

trace = go.Scatter(
x = np.arange(0,epoch,1),y=iters_random,
name ="$random$")
fig.add_trace(trace)

fig.update_layout(
    yaxis_title='$iter\_num$',
    xaxis_title='$times$',
)
fig.show()
In [99]:
fig = go.Figure()

trace = go.Scatter(
x = np.arange(0,epoch,1),y=NMI_center,
name ="$center$")
fig.add_trace(trace)

trace = go.Scatter(
x = np.arange(0,epoch,1),y=NMI_random,
name ="$random$")
fig.add_trace(trace)

fig.update_layout(
    yaxis_title='$NMI$',
    xaxis_title='$times$',
)
fig.show()

4 K 的影响

In [93]:
k = 10
model = K_Means(k = k ,max_iter=1000) ## 已知5
np.random.shuffle(data)
iters = model.fit(data)
result = []
for d in data:
    result.append(model.predict(d))
result = np.array(result)
print(iters)
17
In [94]:
xs = [data[np.where(result==i)] for i in range(k)]
fig = go.Figure()

for i in range(k):
    trace = go.Scatter(
    x = xs[i][:,0],y = xs[i][:,1],
        mode='markers',
    name ="$id = {}$".format(i+1))
    fig.add_trace(trace)

fig.update_layout(
xaxis_title='$x_1$',
yaxis_title='$x_2$',
)
fig.show()
In [95]:
k = 2
model = K_Means(k = k ,max_iter=1000) ## 已知5
np.random.shuffle(data)
iters = model.fit(data)
result = []
for d in data:
    result.append(model.predict(d))
result = np.array(result)
print(iters)
0
In [96]:
xs = [data[np.where(result==i)] for i in range(k)]
fig = go.Figure()

for i in range(k):
    trace = go.Scatter(
    x = xs[i][:,0],y = xs[i][:,1],
        mode='markers',
    name ="$id = {}$".format(i+1))
    fig.add_trace(trace)

fig.update_layout(
xaxis_title='$x_1$',
yaxis_title='$x_2$',
)
fig.show()
In [63]:
ks = [2,3,5,6,10,50]
avg = 3
NMIs = []
iters_time = []

for k in ks:
    avgs_NMI = []
    avgs_iter = []
    
    for _ in range(avg):
        
        model = K_Means(k = k ,max_iter=1000) ## 已知5
        np.random.shuffle(data)
        iters = model.fit(data)
        result = []
        for d in data:
            result.append(model.predict(d))
        result = np.array(result)
        
        avgs_iter.append(iters)
        avgs_NMI.append(NMI(result,data[:,-1]))
    NMIs.append(sum(avgs_NMI)/len(avgs_NMI))
    iters_time.append(sum(avgs_iter)/len(avgs_iter))
In [64]:
NMIs
Out[64]:
[0.6206308446724286,
 0.7238215850621891,
 0.9860034183151837,
 0.9427871306167654,
 0.8519030558888433,
 0.6456092997867772]
In [65]:
iters_time
Out[65]:
[10.0,
 8.333333333333334,
 4.666666666666667,
 6.666666666666667,
 24.666666666666668,
 22.0]
In [68]:
fig = go.Figure()

trace = go.Scatter(
x = ks,y=NMIs,
name ="$NMIs$")
fig.add_trace(trace)



fig.update_layout(
    xaxis_title='$k$',
    yaxis_title='$avg\_NMI$',
)
fig.show()
In [71]:
fig = go.Figure()

trace = go.Scatter(
x = ks,y=iters_time,
name ="$NMIs$")
fig.add_trace(trace)



fig.update_layout(
    xaxis_title='$k$',
    yaxis_title='$avg\_iters$',
)
fig.show()